# -*- coding: utf-8 -*-
import re
from datetime import datetime

import scrapy
from scrapy import Request
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from SinaWeiboSpider.items import WeiboItem
from SinaWeiboSpider.spiders.utils import time_fix, extract_weibo_content


class WeiboAccountWeiboSpider(scrapy.Spider):
    """Crawl the weibo.cn mobile listing pages of one account and yield a
    WeiboItem per post, following "全文" links to fetch truncated content.
    """
    name = 'sina_account_weibo'
    allowed_domains = ['weibo.cn']
    base_url = 'https://weibo.cn/'

    # Compiled once at class level: pulls the first digit run out of link
    # texts such as "赞[12]" / "转发[3]" / "评论[7]".
    _digits_re = re.compile(r'\d+')

    def start_requests(self):
        """Request the first 14 listing pages of the hard-coded account.

        To crawl by uid instead, request
        f'{self.base_url}{uid}/profile?page=1' for each uid.
        """
        for page in range(1, 15):
            yield Request(url=f'{self.base_url}zjfangfang?page={page}',
                          callback=self.parse, priority=1)

    def _count_from(self, weibo_node, xpath_expr):
        """Return the integer embedded in the last link text matched by
        *xpath_expr* (e.g. "赞[3]" -> 3), or 0 when the link or number
        is missing."""
        texts = weibo_node.xpath(xpath_expr).extract()
        if not texts:
            return 0
        match = self._digits_re.search(texts[-1])
        return int(match.group()) if match else 0

    def parse(self, response):
        """Parse one listing page: yield a WeiboItem per post, or a follow-up
        request (handled by parse_all_content) for truncated posts."""
        # Each post lives in a <div class="c" id="...">.
        weibo_nodes = response.xpath('//div[@class="c" and @id]')
        for weibo_node in weibo_nodes:
            weibo_item = WeiboItem()
            # NOTE(review): '%X' is locale-dependent; '%H:%M:%S' would be
            # stable — confirm before changing stored-data format.
            weibo_item['crawl_time'] = datetime.now().strftime('%Y-%m-%d %X')

            # The repost link href encodes both the weibo id and author uid.
            weibo_repost_url = weibo_node.xpath('.//a[contains(text(),"转发[")]/@href').extract_first()
            if weibo_repost_url:
                user_weibo_id = re.search(r'/repost/(.*?)\?uid=(\d+)', weibo_repost_url)
                # Guard: a malformed href no longer raises AttributeError.
                if user_weibo_id:
                    weibo_item['weibo_url'] = 'https://weibo.com/{}/{}'.format(user_weibo_id.group(2),
                                                                               user_weibo_id.group(1))
                    weibo_item['user_id'] = user_weibo_id.group(2)
                    weibo_item['weibo_id'] = '{}_{}'.format(user_weibo_id.group(2), user_weibo_id.group(1))

            # The last <span class="ct"> holds "<timestamp> 来自<client>".
            create_time_nodes = weibo_node.xpath('.//span[@class="ct"]')
            if create_time_nodes:
                create_time_info = create_time_nodes[-1].xpath('string(.)').extract_first()
                if "来自" in create_time_info:
                    weibo_item['created_at'] = time_fix(create_time_info.split('来自')[0].strip())
                    weibo_item['tool'] = create_time_info.split('来自')[1].strip()
                else:
                    weibo_item['created_at'] = time_fix(create_time_info.strip())

            # Interaction counters; all default to 0 when the link is absent.
            weibo_item['like_num'] = self._count_from(
                weibo_node, './/a[contains(text(),"赞[")]/text()')
            weibo_item['repost_num'] = self._count_from(
                weibo_node, './/a[contains(text(),"转发[")]/text()')
            weibo_item['comment_num'] = self._count_from(
                weibo_node, './/a[contains(text(),"评论[") and not(contains(text(),"原文"))]/text()')

            # First attached image, if any.
            images = weibo_node.xpath('.//img[@alt="图片"]/@src').extract()
            if images:
                weibo_item['image_url'] = images[0]

            # First attached video, if any.
            videos = weibo_node.xpath(
                './/a[contains(@href,"https://m.weibo.cn/s/video/show?object_id=")]/@href').extract()
            if videos:
                weibo_item['video_url'] = videos[0]

            # Geotag: coordinates are embedded in the map link's query string.
            map_node = weibo_node.xpath('.//a[contains(text(),"显示地图")]')
            if map_node:
                map_node_url = map_node[0].xpath('./@href')[0].extract()
                map_info = re.search(r'xy=(.*?)&', map_node_url)
                weibo_item['location_map_info'] = map_info.group(1) if map_info else None

            # For reposts, keep a link back to the original weibo.
            repost_node = weibo_node.xpath('.//a[contains(text(),"原文评论[")]/@href').extract()
            if repost_node:
                weibo_item['origin_weibo'] = repost_node[0]

            # Truncated posts carry a "全文" (full text) link; fetch it before
            # emitting the item, otherwise extract the content in place.
            all_content_link = weibo_node.xpath('.//a[text()="全文" and contains(@href,"ckAll=1")]')
            if all_content_link:
                all_content_url = self.base_url + all_content_link[0].xpath('./@href')[0].extract()
                yield Request(all_content_url, callback=self.parse_all_content,
                              meta={'item': weibo_item}, priority=1)
            else:
                weibo_html = weibo_node.xpath('string(.)').extract_first()
                weibo_item['content'] = extract_weibo_content(weibo_html)
                yield weibo_item

    def parse_all_content(self, response):
        """Fill in the full content of a truncated post and yield the item
        that was threaded through request meta."""
        weibo_item = response.meta['item']
        weibo_html = response.xpath('string(//*[@id="M_"]/div[1])').extract_first()
        weibo_item['content'] = extract_weibo_content(weibo_html)
        yield weibo_item


if __name__ == '__main__':
    # Run this spider standalone, reusing the Scrapy project's settings.
    crawler = CrawlerProcess(get_project_settings())
    crawler.crawl('sina_account_weibo')
    crawler.start()